There is a plethora of streaming services available for anyone with a credit card handy. Hulu and Netflix are general entertainment providers offering licensed content and originals for subscribers. Others, like anime provider Crunchyroll and NFL Sunday Ticket, are targeted at specific interests. Streaming services like Disney Plus and Apple TV Plus are flashy newcomers that are trying to find their place in an overly crowded space.
**Targets for the Analysis**
**Data Source**
**1.Importing Packages**
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.express as px
from ipywidgets import widgets
import warnings
warnings.filterwarnings('ignore')
**2.Reading Dataset**
# --- Movies catalogue: load, then inspect preview, shape, columns, summary ---
mov = pd.read_csv('Movies.csv')
mov.head()
mov.shape
mov.columns
mov.describe()
# --- TV shows catalogue: load, then inspect preview, shape, summary ---
tv = pd.read_csv('tv_shows.csv')
tv.head()
tv.shape
tv.describe()
**3. Data Cleaning**
# Movies: remove the 'Unnamed: 0' column (the existing ID column is kept).
mov = mov.drop(columns=['Unnamed: 0'])
mov.head()
# TV shows: remove the 'Unnamed: 0' column as well.
tv = tv.drop(columns=['Unnamed: 0'])
tv.head()
# TV shows: add a 1-based ID column computed from the row index.
tv['ID'] = tv.index + 1
tv.head()
**4. Data Visualization**
TV Shows and Movies Distribution Across Platforms
def summ(df, b):
    """Return the sum of column ``b`` of ``df``.

    Args:
        df: DataFrame to read from.
        b: Label of the column to add up.

    Returns:
        The column total, as computed by ``Series.sum(axis=0)``.
    """
    return df[b].sum(axis=0)
# Counting the number of movies and TV shows on each platform
df = [mov, tv]
cols = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
# Resulting order: the four movie totals (in `cols` order) followed by the
# four TV-show totals (same order).
counts = [summ(frame, platform) for frame in df for platform in cols]
counts
# Setting default values for the subplots
def pieplot(i, df, portion, title):
    '''Draw one pie chart of platform shares in subplot position ``i``.

    The slice styling is read from the module-level names ``explode``,
    ``labels`` and ``colors``, which must be defined before this is called.

    Args:
        i: Subplot position passed to ``plt.subplot`` (e.g. ``121``).
        df: Not used by the function; kept so existing calls stay valid.
        portion: Slice sizes, one per entry of ``labels``.
        title: Title shown above the pie.
    '''
    plt.subplot(i)
    plt.pie(portion, explode=explode, labels=labels, colors=colors,
            shadow=True, autopct='%1.1f%%')
    plt.title(title)
    # Equal axis scaling so the pie is drawn as a circle.
    plt.axis('equal')
# Increase the font size used by matplotlib
import matplotlib as mpl
mpl.rcParams['font.size'] = 15.0
# Plotting.
# plt.figure() creates only the figure; plt.subplots() would return a
# (figure, axes) tuple and add an extra Axes that the two pies are then
# drawn on top of.
fig = plt.figure(figsize=(17, 10))
labels = 'Netflix', 'Hulu', 'Prime Video', 'Disney+'
# `counts` holds the four movie totals first, then the four TV-show totals.
portion_m = counts[0:4]
portion_t = counts[4:8]
colors = ['r', 'springgreen', 'deepskyblue', 'darkblue']
# Pull the first slice (Netflix) slightly out of each pie.
explode = (0.1, 0, 0, 0)
pieplot(121, mov, portion_m, 'Movies')
pieplot(122, tv, portion_t, 'TV shows')
plt.show()
Movie Count by Country and Language
Aggregating Movies dataset by language and counting the titles in each language
# Titles counted per 'Language' value, keeping the ten largest groups.
movie_count_by_language = (
    mov.groupby('Language')['Title']
    .count()
    .reset_index()
    .sort_values('Title', ascending=False)
    .head(10)
    .rename(columns={'Title': 'Movie Count'})
)
fig = px.bar(
    movie_count_by_language,
    x='Language',
    y='Movie Count',
    color='Movie Count',
    title='Movie Count by Language',
    height=500,
)
fig.show()
Grouping the movies dataset by country and counting the titles in each country category
# Titles counted per 'Country' value, keeping the ten largest groups.
movies_by_country = (
    mov.groupby('Country')['Title']
    .count()
    .reset_index()
    .sort_values('Title', ascending=False)
    .head(10)
    .rename(columns={'Title': 'MovieCount'})
)
fig = px.pie(movies_by_country, names='Country', values='MovieCount')
# Rotate the pie and pull the first five slices outwards (the first one most).
fig.update_traces(rotation=180, pull=[0.1, 0.03, 0.03, 0.03, 0.03], textinfo="percent")
chart_font = {'family': "Courier New, monospace", 'size': 18, 'color': "black"}
fig.update_layout(showlegend=True, title_text='Movie Count by Country', font=chart_font)
fig.show()
Movie Age Rating On Each Platform
# Counting movies in each age category, platform by platform
def _movie_age_counts(platform):
    """Return an Age / ID-count table for the movies flagged on ``platform``."""
    on_platform = mov[mov[platform] == 1]
    return on_platform.groupby(['Age', platform]).count()['ID'].reset_index()[['Age', 'ID']]


age_netflix = _movie_age_counts('Netflix')
age_hulu = _movie_age_counts('Hulu')
age_prime = _movie_age_counts('Prime Video')
age_disney = _movie_age_counts('Disney+')

fig = go.Figure()
fig.update_layout(title_text='Movies Age Rating')
# One funnel trace per platform, added in this order; each entry lists exactly
# the options set for that platform.
funnel_options = [
    dict(name='Netflix', y=age_netflix.Age, x=age_netflix['ID'],
         textinfo="value", marker={'color': 'red'}),
    dict(name='Prime', orientation='h', y=age_prime.Age, x=age_prime['ID'],
         textposition='inside', textinfo="value", marker={'color': 'deepskyblue'}),
    dict(name='Hulu', orientation='h', y=age_hulu.Age, x=age_hulu['ID'],
         textposition='inside', textinfo="value", marker={'color': 'lime'}),
    dict(name='Disney+', y=age_disney.Age, x=age_disney['ID'],
         textposition='outside', textinfo="value", marker={'color': 'navy'}),
]
for options in funnel_options:
    fig.add_trace(go.Funnel(**options))
fig.show()
TV Shows Age Rating On Each Platform
# Counting TV shows in each age category, separately for every platform
age_n = tv[tv.Netflix == 1].groupby(['Age', 'Netflix']).count()['ID'].reset_index()[['Age', 'ID']]
age_h = tv[tv.Hulu == 1].groupby(['Age', 'Hulu']).count()['ID'].reset_index()[['Age', 'ID']]
age_p = tv[tv['Prime Video'] == 1].groupby(['Age', 'Prime Video']).count()['ID'].reset_index()[['Age', 'ID']]
age_d = tv[tv['Disney+'] == 1].groupby(['Age', 'Disney+']).count()['ID'].reset_index()[['Age', 'ID']]
# Every age group that occurs on at least one platform. A platform may lack
# some groups, so each pie below is labelled from its own table instead of
# borrowing the Netflix labels (which would pair counts with wrong groups).
labels = sorted(set(age_n.Age) | set(age_h.Age) | set(age_p.Age) | set(age_d.Age))
# Colour palette; each age group gets one fixed colour across all four pies.
colors = ['deeppink', 'blue', 'yellow', 'navy', 'green']
color_by_age = {age: colors[k % len(colors)] for k, age in enumerate(labels)}
# Create subplots, using 'domain' type for pie charts
specs = [[{'type': 'domain'}, {'type': 'domain'}], [{'type': 'domain'}, {'type': 'domain'}]]
fig = make_subplots(rows=2, cols=2, specs=specs,
                    subplot_titles=['Netflix', 'Hulu', 'Prime Video', 'Disney+'])
# Define pie charts: (age table, trace name, subplot row, subplot column)
pie_layout = [
    (age_n, 'Netflix', 1, 1),
    (age_h, 'Hulu', 1, 2),
    (age_p, 'Prime Video', 2, 1),
    (age_d, 'Disney+', 2, 2),
]
for age_counts, platform_name, row, col in pie_layout:
    fig.add_trace(
        go.Pie(labels=age_counts['Age'], values=age_counts['ID'], name=platform_name, hole=.3,
               marker_colors=[color_by_age[age] for age in age_counts['Age']]),
        row, col)
# Tune layout and hover info
fig.update_traces(hoverinfo='label+percent+name', textinfo='percent')
fig.update_layout(
    # centred title near the top edge
    title={'text': 'TV Shows Age Rating', 'y': 0.98, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    # small margins so the pies get more room
    margin=dict(l=20, r=20, t=20, b=20, pad=10),
    showlegend=True,
)
fig.show()
Movies Count by Genre
# Movie counts per 'Genres' value for each platform (these four tables are not
# referenced again in the visible part of the notebook).
genre_n = mov[mov.Netflix == 1].groupby(['Genres', 'Netflix']).count()['ID'].reset_index()[['Genres', 'ID']]
genre_h = mov[mov.Hulu == 1].groupby(['Genres', 'Hulu']).count()['ID'].reset_index()[['Genres', 'ID']]
genre_p = mov[mov['Prime Video'] == 1].groupby(['Genres', 'Prime Video']).count()['ID'].reset_index()[['Genres', 'ID']]
genre_d = mov[mov['Disney+'] == 1].groupby(['Genres', 'Disney+']).count()['ID'].reset_index()[['Genres', 'ID']]
# Separating movies by streaming platform.
# Every flag column is compared with 1 to build a boolean filter; a bare flag
# column passed to .loc is interpreted as a list of row labels, not a filter.
netflix = mov.loc[mov['Netflix'] == 1]
hulu = mov.loc[mov['Hulu'] == 1]
prime_video = mov.loc[mov['Prime Video'] == 1]
disney = mov.loc[mov['Disney+'] == 1]
# Dropping the columns of the other platforms and the 'Type' column
netflix = netflix.drop(['Hulu', 'Prime Video', 'Disney+', 'Type'], axis=1)
hulu = hulu.drop(['Netflix', 'Prime Video', 'Disney+', 'Type'], axis=1)
prime = prime_video.drop(['Hulu', 'Netflix', 'Disney+', 'Type'], axis=1)
disney = disney.drop(['Hulu', 'Prime Video', 'Netflix', 'Type'], axis=1)
def genre(df, title):
    """Show a horizontal bar chart of the 15 most frequent ``Genres`` values.

    Rows are grouped by the raw ``Genres`` value and counted through their
    ``ID`` column.

    Args:
        df: DataFrame with at least ``Genres`` and ``ID`` columns.
        title: Chart title.
    """
    genres_count = df.groupby('Genres', as_index=False).count()
    genres_count = genres_count[['Genres', 'ID']].rename({'ID': 'Count'}, axis='columns')
    genres_count = genres_count.sort_values(by='Count', ascending=False)
    fig = px.bar(genres_count.head(15), y='Genres', x="Count", color='Genres',
                 orientation="h", text="Count", title=title)
    fig.update_layout(
        # transparent paper and plot backgrounds
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis_tickfont_size=14,
        yaxis=dict(
            # title.font replaces the deprecated `titlefont` attribute
            title=dict(text='Movie genre', font=dict(size=16)),
            tickfont_size=14,
        ),
        legend=dict(
            x=1,
            y=1.0,
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)',
        ),
    )
    fig.show()
# One genre chart per platform, drawn in this order.
for platform_movies, chart_title in (
    (netflix, 'Netflix: Movie distribution by Genre'),
    (hulu, 'Hulu: Movie distribution by Genre'),
    (prime, 'Prime Video: Movie distribution by Genre'),
    (disney, 'Disney+: Movie distribution by Genre'),
):
    genre(platform_movies, chart_title)
Movies with Highest Rating on Each Platform by Genre
def treemap(df, platform, color, title):
    """Show a treemap of the 15 highest IMDb-rated rows flagged on ``platform``.

    The hierarchy is IMDb -> Title -> Genres -> Directors, coloured by IMDb.

    Args:
        df: DataFrame containing the ``platform`` flag column plus ``IMDb``,
            ``Title``, ``Genres`` and ``Directors``.
        platform: Name of the flag column; rows where it equals 1 are kept.
        color: Continuous colour scale name passed to plotly.
        title: Chart title.
    """
    on_platform = df.loc[df[platform] == 1]
    # Highest IMDb first, then keep the first 15 rows.
    rate = on_platform.sort_values(by='IMDb', ascending=False)[0:15]
    fig = px.treemap(rate, path=['IMDb', 'Title', 'Genres', 'Directors'],
                     color='IMDb', title=title, color_continuous_scale=color)
    fig.show()
# One treemap per platform: (flag column, colour scale, chart title).
for flag_column, scale_name, chart_title in (
    ('Netflix', 'reds', 'Netflix'),
    ('Hulu', 'greens', 'Hulu'),
    ('Prime Video', 'blues', 'Prime Video'),
    ('Disney+', 'bupu', 'Disney+'),
):
    treemap(mov, flag_column, scale_name, chart_title)
TV Shows with Highest Rating on Each Platform
def sun_t(df, platform, color, title):
    """Show a sunburst of the 15 highest IMDb-rated rows flagged on ``platform``.

    The hierarchy is Title -> Year; sector size and colour both come from IMDb.

    Args:
        df: DataFrame containing the ``platform`` flag column plus ``IMDb``,
            ``Title`` and ``Year``.
        platform: Name of the flag column; rows where it equals 1 are kept.
        color: Continuous colour scale name passed to plotly.
        title: Chart title.
    """
    on_platform = df.loc[df[platform] == 1]
    # Highest IMDb first, then keep the first 15 rows.
    rate = on_platform.sort_values(by='IMDb', ascending=False)[0:15]
    fig = px.sunburst(rate, path=['Title', 'Year'], values='IMDb', color='IMDb',
                      color_continuous_scale=color, title=title)
    fig.show()
# One sunburst per platform: (flag column, colour scale, chart title).
for flag_column, scale_name, chart_title in (
    ('Netflix', 'hot', 'Netflix'),
    ('Hulu', 'greens', 'Hulu'),
    ('Prime Video', 'blues', 'Prime Video'),
    ('Disney+', 'electric', 'Disney+'),
):
    sun_t(tv, flag_column, scale_name, chart_title)
Run Time of Movies with 9.3 IMDb Rating on all Platforms
# Title, runtime and rating of every movie whose IMDb value equals 9.3.
runtime_top = mov.loc[mov['IMDb'] == 9.3, ['Title', 'Runtime', 'IMDb']]
fig = px.bar(
    runtime_top,
    x='Title',
    y='Runtime',
    color='Runtime',
    height=500,
    title='Runtime of Movies with 9.3 IMDb rating',
)
fig.show()
Movies and Directors
# Directors who directed the best and worst IMDb ranked movies
n = 10
x = "Directors"
# Average of every numeric column per 'Directors' value. numeric_only=True
# keeps text columns out of the mean; averaging them raises on pandas >= 2.0.
director_means = mov.groupby(by=x).mean(numeric_only=True)
best = director_means.sort_values(by="IMDb", ascending=False).reset_index().iloc[:n]
worst = director_means.sort_values(by="IMDb", ascending=True).reset_index()
# Entries with an average of 0 are left out of the "worst" list (previously
# done by skipping a fixed two rows, which left only n - 2 entries under a
# "Top n" title); n entries are kept after the filter.
worst = worst[worst["IMDb"] > 0].iloc[:n]
# One funnel-area chart per ranking, best first.
for ranking, ranking_title in (
    (best, f"Top {n} Directors with the Highest Average IMDb Movie Ratings Across Platforms"),
    (worst, f"Top {n} Directors with the lowest Average IMDb Movie Ratings Across Platforms"),
):
    fig = go.Figure(go.Funnelarea(
        text=ranking.Directors,
        values=ranking.IMDb,
        textinfo='value+text',
        showlegend=False,
        title=ranking_title,
    ))
    fig.update_layout(
        # no margins, to increase the chart size
        margin=dict(l=0, r=0, t=0, b=0, pad=0),
        font=dict(
            family="Courier New, monospace",
            size=25,
            color="black"),
    )
    fig.show()
Movie Availability
def Availability(title):
    """Print, platform by platform, whether the movie ``title`` is listed.

    Looks for an exact ``Title`` match in the module-level per-platform movie
    tables ``netflix``, ``prime``, ``disney`` and ``hulu`` and prints one line
    per platform, in that order.

    Args:
        title: Movie title to look up (exact match).
    """
    # (platform table, message when listed, message when not listed)
    checks = (
        (netflix, 'It is available on Netflix! :)',
         'Sorry, this movie is not Available on Netflix. :('),
        (prime, 'It is available on PrimeVideo! :)',
         'Sorry, this movie not Available on PrimeVideo:('),
        (disney, 'It is available on Disney+! :)',
         'Sorry, this movie is not Available on Disney+. :('),
        (hulu, 'It is available on Hulu! :)',
         'Sorry, this movie is not Available on Hulu. :('),
    )
    for catalogue, listed_message, missing_message in checks:
        is_listed = (catalogue['Title'] == title).any()
        print(listed_message if is_listed else missing_message)
Input a movie title you want to watch
# Movie to look up on the four platforms.
title = 'Inception'
Availability(title=title)
TV Shows Availability
# Separating TV shows by platform.
# Every flag column is compared with 1 to build a boolean filter; a bare flag
# column passed to .loc is interpreted as a list of row labels, not a filter.
n = tv.loc[tv['Netflix'] == 1]
h = tv.loc[tv['Hulu'] == 1]
p = tv.loc[tv['Prime Video'] == 1]
d = tv.loc[tv['Disney+'] == 1]
def Availability(title):
    """Print, platform by platform, whether the TV show ``title`` is listed.

    Looks for an exact ``Title`` match in the module-level per-platform TV
    tables ``n`` (Netflix), ``p`` (Prime Video), ``d`` (Disney+) and ``h``
    (Hulu) and prints one line per platform, in that order.

    Args:
        title: TV show title to look up (exact match).
    """
    # (platform table, message when listed, message when not listed)
    checks = (
        (n, 'It is available on Netflix! :)',
         'Sorry, this TV Show is not Available on Netflix. :('),
        (p, 'It is available on PrimeVideo! :)',
         'Sorry, this TV Show not Available on PrimeVideo:('),
        (d, 'It is available on Disney+! :)',
         'Sorry, this TV show is not Available on Disney+. :('),
        (h, 'It is available on Hulu! :)',
         'Sorry, this TV show is not Available on Hulu. :('),
    )
    for catalogue, listed_message, missing_message in checks:
        is_listed = (catalogue['Title'] == title).any()
        print(listed_message if is_listed else missing_message)
Input a TV show title you want to watch
# TV show to look up on the four platforms.
title = 'Breaking Bad'
Availability(title=title)
**5. Conclusion**
GitHub and Zenodo Links
Learning Process
